{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4885844e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from graphdatascience import GraphDataScience\n",
    "\n",
    "# Do not truncate long cell values (URLs, keyword lists) when frames are displayed.\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "# Connection settings are read from the environment so real credentials never\n",
    "# need to be committed with the notebook. The fallbacks are the values this\n",
    "# notebook originally hard-coded, so it behaves the same when nothing is set.\n",
    "host = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
    "user = os.environ.get(\"NEO4J_USER\", \"neo4j\")\n",
    "password = os.environ.get(\"NEO4J_PASSWORD\", \"letmein\")\n",
    "\n",
    "# Client handle used by every following cell.\n",
    "gds = GraphDataScience(host, auth=(user, password))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "faa333ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>labels</th>\n",
       "      <th>relTypesCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>{'Keyword': 4198, 'Page': 15370}</td>\n",
       "      <td>{'LINKS_TO': 62365, 'HAS_KEYWORD': 26851, 'REDIRECTS': 723}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             labels  \\\n",
       "0  {'Keyword': 4198, 'Page': 15370}   \n",
       "\n",
       "                                                 relTypesCount  \n",
       "0  {'LINKS_TO': 62365, 'HAS_KEYWORD': 26851, 'REDIRECTS': 723}  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Graph overview: node counts per label and relationship counts per type.\n",
    "graph_stats_query = \"\"\"\n",
    "CALL apoc.meta.stats()\n",
    "YIELD labels, relTypesCount\n",
    "\"\"\"\n",
    "gds.run_cypher(graph_stats_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c56b6b90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>has_text</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>9688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>2972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>None</td>\n",
       "      <td>2710</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  has_text  count\n",
       "0     True   9688\n",
       "1    False   2972\n",
       "2     None   2710"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count pages per value of the has_text property (a NULL row groups pages\n",
    "# where the property is not set).\n",
    "has_text_query = \"\"\"\n",
    "MATCH (p:Page)\n",
    "RETURN p.has_text AS has_text,\n",
    "       count(*) AS count\n",
    "\"\"\"\n",
    "gds.run_cypher(has_text_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "56082692",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>page</th>\n",
       "      <th>links</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>http://localhost:7474</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.3.0.12</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.4.0.12</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/text-analytics-sdk</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                      page  \\\n",
       "0                                                                                    http://localhost:7474   \n",
       "1                             https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.3.0.12   \n",
       "2                             https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/4.4.0.12   \n",
       "3  https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/text-analytics-sdk   \n",
       "4                         https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html   \n",
       "\n",
       "   links  \n",
       "0     38  \n",
       "1     38  \n",
       "2     37  \n",
       "3     37  \n",
       "4     37  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 5 pages whose has_text property is NULL, ranked by the number of\n",
    "# incoming LINKS_TO / REDIRECTS relationships.\n",
    "null_text_pages_query = \"\"\"\n",
    "MATCH (p:Page)\n",
    "WHERE p.has_text IS NULL\n",
    "RETURN p.url AS page,\n",
    "       count{(p)<-[:LINKS_TO|REDIRECTS]-()} AS links\n",
    "ORDER BY links DESC\n",
    "LIMIT 5\n",
    "\"\"\"\n",
    "gds.run_cypher(null_text_pages_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "687d9013",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>brokenLinkCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>241</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   brokenLinkCount\n",
       "0              241"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of LINKS_TO / REDIRECTS relationships that point at a page flagged is_404.\n",
    "broken_links_query = \"\"\"\n",
    "MATCH (:Page)-[:LINKS_TO|REDIRECTS]->(:Page{is_404:true})\n",
    "RETURN count(*) AS brokenLinkCount\n",
    "\"\"\"\n",
    "gds.run_cypher(broken_links_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "07b2a43b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>path</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[https://neo4j.com/docs, https://neo4j.com/docs/aura/auradb, https://neo4j.com/docs/aura/auradb/getting-started/create-database, https://console.neo4j.io]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                         path\n",
       "0  [https://neo4j.com/docs, https://neo4j.com/docs/aura/auradb, https://neo4j.com/docs/aura/auradb/getting-started/create-database, https://console.neo4j.io]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shortest directed chain of links/redirects (at most 10 hops) from the docs\n",
    "# landing page to the console, returned as the list of URLs along the path.\n",
    "docs_to_console_query = \"\"\"\n",
    "MATCH (start:Page {url:\"https://neo4j.com/docs\"}), \n",
    "      (end:Page {url:\"https://console.neo4j.io\"})\n",
    "MATCH p=shortestPath((start)-[:LINKS_TO|REDIRECTS*..10]->(end))\n",
    "RETURN [n in nodes(p) | n.url] AS path\n",
    "\"\"\"\n",
    "gds.run_cypher(docs_to_console_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cdbf9ebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project the page-to-page link structure into an in-memory graph named\n",
    "# \"structure\": Page nodes with LINKS_TO and REDIRECTS relationships.\n",
    "G, metadata = gds.graph.project(\n",
    "    \"structure\",\n",
    "    \"Page\",\n",
    "    [\"LINKS_TO\", \"REDIRECTS\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "89036e07",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nodeId</th>\n",
       "      <th>score</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4261</th>\n",
       "      <td>6174</td>\n",
       "      <td>598.0</td>\n",
       "      <td>https://neo4j.com/developer/kb</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8571</th>\n",
       "      <td>11257</td>\n",
       "      <td>391.0</td>\n",
       "      <td>https://neo4j.com/developer-blog/tagged/neo4j</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8924</th>\n",
       "      <td>11610</td>\n",
       "      <td>235.0</td>\n",
       "      <td>https://neo4j.com/developer-blog/tagged/graph-database</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2120</th>\n",
       "      <td>3040</td>\n",
       "      <td>165.0</td>\n",
       "      <td>https://neo4j.com/graphgists/categories/web-amp-social</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2114</th>\n",
       "      <td>3034</td>\n",
       "      <td>165.0</td>\n",
       "      <td>https://neo4j.com/graphgists/categories/finance</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      nodeId  score                                                     url\n",
       "4261    6174  598.0                          https://neo4j.com/developer/kb\n",
       "8571   11257  391.0           https://neo4j.com/developer-blog/tagged/neo4j\n",
       "8924   11610  235.0  https://neo4j.com/developer-blog/tagged/graph-database\n",
       "2120    3040  165.0  https://neo4j.com/graphgists/categories/web-amp-social\n",
       "2114    3034  165.0         https://neo4j.com/graphgists/categories/finance"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Degree centrality on the reversed orientation, i.e. incoming links per page.\n",
    "df = gds.degree.stream(G, orientation=\"REVERSE\")\n",
    "\n",
    "# Resolve node ids back to database nodes so each row can carry its URL.\n",
    "page_nodes = gds.util.asNodes(df[\"nodeId\"].to_list())\n",
    "df[\"url\"] = [node[\"url\"] for node in page_nodes]\n",
    "\n",
    "# Highest score first; rebinding replaces the original in-place sort.\n",
    "df = df.sort_values(\"score\", ascending=False)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "048a26fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PageRank per page; its score column is renamed so it does not clash with\n",
    "# the degree score already stored in df.\n",
    "pr_df = gds.pageRank.stream(G).rename(columns={\"score\": \"pagerank\"})\n",
    "\n",
    "# Join degree and PageRank on nodeId and order by PageRank, highest first.\n",
    "combined_df = df.merge(pr_df, on=\"nodeId\").sort_values(\n",
    "    \"pagerank\", ascending=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "aafb07b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nodeId</th>\n",
       "      <th>score</th>\n",
       "      <th>url</th>\n",
       "      <th>pagerank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6174</td>\n",
       "      <td>598.0</td>\n",
       "      <td>https://neo4j.com/developer/kb</td>\n",
       "      <td>38.593852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>13541</td>\n",
       "      <td>55.0</td>\n",
       "      <td>https://neo4j.com/graphconnect-2018</td>\n",
       "      <td>21.989673</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>232</th>\n",
       "      <td>14770</td>\n",
       "      <td>21.0</td>\n",
       "      <td>https://neo4j.com/labs/apoc/4.4/graph-querying/node-querying</td>\n",
       "      <td>12.607568</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>18265</td>\n",
       "      <td>21.0</td>\n",
       "      <td>https://neo4j.com/labs/apoc/4.3/graph-querying/node-querying</td>\n",
       "      <td>12.420046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>856</td>\n",
       "      <td>70.0</td>\n",
       "      <td>https://neo4j.com/docs/operations-manual/5/reference/configuration-settings</td>\n",
       "      <td>12.392406</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     nodeId  score  \\\n",
       "0      6174  598.0   \n",
       "43    13541   55.0   \n",
       "232   14770   21.0   \n",
       "242   18265   21.0   \n",
       "25      856   70.0   \n",
       "\n",
       "                                                                             url  \\\n",
       "0                                                 https://neo4j.com/developer/kb   \n",
       "43                                           https://neo4j.com/graphconnect-2018   \n",
       "232                 https://neo4j.com/labs/apoc/4.4/graph-querying/node-querying   \n",
       "242                 https://neo4j.com/labs/apoc/4.3/graph-querying/node-querying   \n",
       "25   https://neo4j.com/docs/operations-manual/5/reference/configuration-settings   \n",
       "\n",
       "      pagerank  \n",
       "0    38.593852  \n",
       "43   21.989673  \n",
       "232  12.607568  \n",
       "242  12.420046  \n",
       "25   12.392406  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top rows of the degree + PageRank table (already sorted by pagerank).\n",
    "combined_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ebecbd3c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 structure\n",
       "database                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      neo4j\n",
       "memoryUsage                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        \n",
       "sizeInBytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      -1\n",
       "nodeCount                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     15370\n",
       "relationshipCount                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             63088\n",
       "configuration        {'relationshipProjection': {'LINKS_TO': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': 'LINKS_TO', 'properties': {}}, 'REDIRECTS': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': 'REDIRECTS', 'properties': {}}}, 'jobId': 'aadd6fc4-d7f8-4515-8f3f-78d0b5565782', 'nodeProjection': {'Page': {'label': 'Page', 'properties': {}}}, 'relationshipProperties': {}, 'creationTime': 2023-01-03T11:11:35.412203427+01:00, 'validateRelationships': False, 'readConcurrency': 4, 'sudo': False, 'nodeProperties': {}}\n",
       "density                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    0.000267\n",
       "creationTime                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    2023-01-03T11:11:35.412203427+01:00\n",
       "modificationTime                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                2023-01-03T11:11:35.438605155+01:00\n",
       "schema                                                                                                                                                                                                                                                                                                                                                                                                                                                           {'graphProperties': {}, 'relationships': {'LINKS_TO': {}, 'REDIRECTS': {}}, 'nodes': {'Page': {}}}\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the projected graph held in G (the `structure` projection) now that\n",
    "# the degree and PageRank results have been collected into data frames.\n",
    "G.drop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5cbd1daa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>keyword</th>\n",
       "      <th>mentions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>node</td>\n",
       "      <td>1194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>neo4j</td>\n",
       "      <td>983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>clipboard</td>\n",
       "      <td>868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>graph</td>\n",
       "      <td>596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>java</td>\n",
       "      <td>537</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     keyword  mentions\n",
       "0       node      1194\n",
       "1      neo4j       983\n",
       "2  clipboard       868\n",
       "3      graph       596\n",
       "4       java       537"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 5 keywords by the number of incoming HAS_KEYWORD relationships.\n",
    "top_keywords_query = \"\"\"\n",
    "MATCH (k:Keyword)\n",
    "RETURN k.name AS keyword,\n",
    "       count {(k)<-[:HAS_KEYWORD]-()} AS mentions\n",
    "ORDER BY mentions DESC\n",
    "LIMIT 5\n",
    "\"\"\"\n",
    "gds.run_cypher(top_keywords_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "26af37d2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>keyword</th>\n",
       "      <th>mentions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>graph</td>\n",
       "      <td>259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>algorithm</td>\n",
       "      <td>188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>node</td>\n",
       "      <td>127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>neo4j graph</td>\n",
       "      <td>88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>neo4j</td>\n",
       "      <td>76</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       keyword  mentions\n",
       "0        graph       259\n",
       "1    algorithm       188\n",
       "2         node       127\n",
       "3  neo4j graph        88\n",
       "4        neo4j        76"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 5 keywords restricted to pages whose URL contains \"graph-data-science\".\n",
    "gds_keywords_query = \"\"\"\n",
    "MATCH (p:Page)-[:HAS_KEYWORD]->(k:Keyword)\n",
    "WHERE p.url CONTAINS \"graph-data-science\"\n",
    "RETURN k.name AS keyword,\n",
    "       count(*) AS mentions\n",
    "ORDER BY mentions DESC\n",
    "LIMIT 5\n",
    "\"\"\"\n",
    "gds.run_cypher(gds_keywords_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "8ebcab70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project Page and Keyword nodes into an in-memory graph named \"keywords\".\n",
    "# HAS_KEYWORD is loaded with REVERSE orientation, so in the projection the\n",
    "# relationships run from keywords to the pages that carry them.\n",
    "G, metadata = gds.graph.project(\n",
    "    \"keywords\",\n",
    "    [\"Page\", \"Keyword\"],\n",
    "    {\"HAS_KEYWORD\": {\"orientation\": \"REVERSE\"}},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "664be9dd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "preProcessingMillis                                                                                                                                                                                                                                                                                                                                                                                                      0\n",
       "computeMillis                                                                                                                                                                                                                                                                                                                                                                                                          389\n",
       "mutateMillis                                                                                                                                                                                                                                                                                                                                                                                                             4\n",
       "postProcessingMillis                                                                                                                                                                                                                                                                                                                                                                                                    -1\n",
       "nodesCompared                                                                                                                                                                                                                                                                                                                                                                                                         4198\n",
       "relationshipsWritten                                                                                                                                                                                                                                                                                                                                                                                                  8870\n",
       "similarityDistribution    {'p1': 0.39999961853027344, 'max': 1.0000057220458984, 'p5': 0.5000019073486328, 'p90': 1.0000057220458984, 'p50': 1.0000057220458984, 'p95': 1.0000057220458984, 'p10': 0.5000019073486328, 'p75': 1.0000057220458984, 'p99': 1.0000057220458984, 'p25': 0.5000019073486328, 'p100': 1.0000057220458984, 'min': 0.39999961853027344, 'mean': 0.8378232607578076, 'stdDev': 0.22690173398077387}\n",
       "configuration                                                                          {'topK': 10, 'similarityMetric': 'JACCARD', 'bottomK': 10, 'bottomN': 0, 'mutateRelationshipType': 'CO_OCCUR', 'topN': 0, 'concurrency': 4, 'jobId': 'd0a056f5-bfcd-4eab-acd7-9510b86ed137', 'degreeCutoff': 1, 'similarityCutoff': 0.4, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'score'}\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run Node Similarity in mutate mode: the in-memory graph G gains CO_OCCUR\n",
    "# relationships that carry the similarity value in their `score` property.\n",
    "# similarityCutoff=0.4 keeps only pairs scoring at least 0.4.\n",
    "gds.nodeSimilarity.mutate(\n",
    "    G,\n",
    "    similarityCutoff=0.4,\n",
    "    mutateRelationshipType=\"CO_OCCUR\",\n",
    "    mutateProperty=\"score\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "afe6e544",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>communityId</th>\n",
       "      <th colspan=\"2\" halign=\"left\">keyword</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>634</th>\n",
       "      <td>1250</td>\n",
       "      <td>46</td>\n",
       "      <td>[santa, chewbacca, galaxy, republic, jedi, christmas eve, new year ’ s day, gandhi jayanti dussehra, caribbean iii, ganesh chaturthi, thanksgiving, makar sankranti republic day, vijayadashami, diwali padwa veterans day, labor day, maha shivratri holi memorial day, christmas day, independence day, boxing day, new year ’ s eve, diwali, president ’ day, martin luther king day, good friday, caribbean, christmas, veterans day, thanksgiving day, maha shivratri, king iii, president ’, padwa, holi memorial day, king iii holiday, reindeer, christmas tree, santa claus, street map, rome, android, tatooine, wookiepedia, alderaan, jorge albarran, santavin, luke skywalker]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1749</th>\n",
       "      <td>3181</td>\n",
       "      <td>35</td>\n",
       "      <td>[mar cabra, mossack fonseca, tax haven, panama, panama papers, apache solr, tika,  prizesion, offshore leaks, oxwall source, zurich switzerland, paul kuhne, zurich, swiss leaks, hsbc, hsbc leaks, excel source, president of azerbaijan, panamapaper, mahabharataa, president, data science analytics, quantum analytics, azerbaijan, firepower scandal, cablegate, pentagon papers, ilham aliyev, the case, duncan campbell, baku, heydar aliyev, azerbaijan airlines, dubai, azerfo]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2212</th>\n",
       "      <td>4063</td>\n",
       "      <td>31</td>\n",
       "      <td>[reddit, avengers : infinity war, evelina gabasova, seattle company, craig walls, pokemon, pokemonpher, tom hiddleston, tom hidton, avengers, the avengers, x - men, node classification learning, marvel universe, hulk, spider - man, thanosbra, s . h . i . e . l ., nick fury, black panther, loki, hawkeye, guardians of the galaxy, thanos, infinity stones, marvel cinematic universe, iron man, captain america, doctor strange, thor, natasha romanoff]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1712</th>\n",
       "      <td>3074</td>\n",
       "      <td>31</td>\n",
       "      <td>[netcon, salesforce, anders ekstrom, netconsult, sql database, search engine, musicology, chief technical officer, streaming services, music industry, the orchard, the orchard444j, music distribution, jeremy davies, webflow, stephen o ' grady, media, state, manhattan, marriott marquis times square, tim hanssen, pat patterson, cloudera, streamsetser, hilary mason, jake graham, lauren shin, artificial, change data capture, redmonk, redmon]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1551</th>\n",
       "      <td>2749</td>\n",
       "      <td>27</td>\n",
       "      <td>[maharashtra, airtel, tata, vodafone, mobile operator, gujarat, rajasthan, centurylink, business conglomerate, prodapt, at &amp; t, jhaver group, deutsche telekom, verizon, liberty global, windstream, adtran, virgin media, ebay, forresterner, comcast, fortune 100, accorhotels, jpmorgan chase, atpco, forrester, adeo]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     communityId keyword  \\\n",
       "                    size   \n",
       "634         1250      46   \n",
       "1749        3181      35   \n",
       "2212        4063      31   \n",
       "1712        3074      31   \n",
       "1551        2749      27   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   \n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             list  \n",
       "634   [santa, chewbacca, galaxy, republic, jedi, christmas eve, new year ’ s day, gandhi jayanti dussehra, caribbean iii, ganesh chaturthi, thanksgiving, makar sankranti republic day, vijayadashami, diwali padwa veterans day, labor day, maha shivratri holi memorial day, christmas day, independence day, boxing day, new year ’ s eve, diwali, president ’ day, martin luther king day, good friday, caribbean, christmas, veterans day, thanksgiving day, maha shivratri, king iii, president ’, padwa, holi memorial day, king iii holiday, reindeer, christmas tree, santa claus, street map, rome, android, tatooine, wookiepedia, alderaan, jorge albarran, santavin, luke skywalker]  \n",
       "1749                                                                                                                                                                                                     [mar cabra, mossack fonseca, tax haven, panama, panama papers, apache solr, tika,  prizesion, offshore leaks, oxwall source, zurich switzerland, paul kuhne, zurich, swiss leaks, hsbc, hsbc leaks, excel source, president of azerbaijan, panamapaper, mahabharataa, president, data science analytics, quantum analytics, azerbaijan, firepower scandal, cablegate, pentagon papers, ilham aliyev, the case, duncan campbell, baku, heydar aliyev, azerbaijan airlines, dubai, azerfo]  \n",
       "2212                                                                                                                                                                                                                             [reddit, avengers : infinity war, evelina gabasova, seattle company, craig walls, pokemon, pokemonpher, tom hiddleston, tom hidton, avengers, the avengers, x - men, node classification learning, marvel universe, hulk, spider - man, thanosbra, s . h . i . e . l ., nick fury, black panther, loki, hawkeye, guardians of the galaxy, thanos, infinity stones, marvel cinematic universe, iron man, captain america, doctor strange, thor, natasha romanoff]  \n",
       "1712                                                                                                                                                                                                                                    [netcon, salesforce, anders ekstrom, netconsult, sql database, search engine, musicology, chief technical officer, streaming services, music industry, the orchard, the orchard444j, music distribution, jeremy davies, webflow, stephen o ' grady, media, state, manhattan, marriott marquis times square, tim hanssen, pat patterson, cloudera, streamsetser, hilary mason, jake graham, lauren shin, artificial, change data capture, redmonk, redmon]  \n",
       "1551                                                                                                                                                                                                                                                                                                                                                                    [maharashtra, airtel, tata, vodafone, mobile operator, gujarat, rajasthan, centurylink, business conglomerate, prodapt, at & t, jhaver group, deutsche telekom, verizon, liberty global, windstream, adtran, virgin media, ebay, forresterner, comcast, fortune 100, accorhotels, jpmorgan chase, atpco, forrester, adeo]  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stream Louvain communities, restricted to Keyword nodes connected by the\n",
    "# CO_OCCUR relationships added to G by the Node Similarity step.\n",
    "topic_df = gds.louvain.stream(\n",
    "    G,\n",
    "    nodeLabels=[\"Keyword\"],\n",
    "    relationshipTypes=[\"CO_OCCUR\"],\n",
    ")\n",
    "\n",
    "# Look up the `name` property of every streamed node id.\n",
    "topic_df[\"keyword\"] = [\n",
    "    node[\"name\"] for node in gds.util.asNodes(topic_df[\"nodeId\"].to_list())\n",
    "]\n",
    "\n",
    "# Per community: number of keywords and the keywords themselves,\n",
    "# largest communities first (top five rows shown).\n",
    "(\n",
    "    topic_df.groupby(\"communityId\")\n",
    "    .agg({\"keyword\": [\"size\", list]})\n",
    "    .reset_index()\n",
    "    .sort_values([(\"keyword\", \"size\")], ascending=False)\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0bd9276d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 keywords\n",
       "database                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     neo4j\n",
       "memoryUsage                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       \n",
       "sizeInBytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     -1\n",
       "nodeCount                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    19568\n",
       "relationshipCount                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            35721\n",
       "configuration        {'relationshipProjection': {'HAS_KEYWORD': {'orientation': 'REVERSE', 'aggregation': 'DEFAULT', 'type': 'HAS_KEYWORD', 'properties': {}}}, 'jobId': '0cb8c108-fcbb-4401-90c7-0f82fe0631df', 'nodeProjection': {'Keyword': {'label': 'Keyword', 'properties': {}}, 'Page': {'label': 'Page', 'properties': {}}}, 'relationshipProperties': {}, 'creationTime': 2023-01-03T11:11:45.450356512+01:00, 'validateRelationships': False, 'readConcurrency': 4, 'sudo': False, 'nodeProperties': {}}\n",
       "density                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   0.000093\n",
       "creationTime                                                                                                                                                                                                                                                                                                                                                                                                                                                                   2023-01-03T11:11:45.450356512+01:00\n",
       "modificationTime                                                                                                                                                                                                                                                                                                                                                                                                                                                               2023-01-03T11:11:45.891751925+01:00\n",
       "schema                                                                                                                                                                                                                                                                                                                        {'graphProperties': {}, 'relationships': {'CO_OCCUR': {'score': 'Float (DefaultValue(NaN), TRANSIENT, Aggregation.NONE)'}, 'HAS_KEYWORD': {}}, 'nodes': {'Keyword': {}, 'Page': {}}}\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the projected graph G now that the analysis is finished; the call\n",
    "# returns the metadata of the dropped projection, shown as this cell's output.\n",
    "G.drop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11c44c1f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
